import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
## Load the raw weather data and strip out the columns we will never use ##
data = pd.read_csv('weatherDataAll.csv', sep=",")
# Columns kept in the dataset #
# Average Dew Point - DEWP
# Average Sea Level Pressure - SLP
# Average Station Pressure - STP
# Average Temperature - TEMP
# Average Visibility - VISIB
# Average Wind Speed - WDSP
# Indicators
# Maximum Sustained Wind Speed - MXSPD
# Maximum Temperature - MAX
# Maximum Wind Gust - GUST
# Minimum Temperature - MIN
# Precipitation - PRCP
# Snow Depth - SNDP
data.head()
# Positional indices of columns that won't really affect the linear
# regression prediction, so they are dropped up front.
unneeded_positions = [0, 2, 3, 7, 9, 11, 13, 15, 17, 21, 23, 25, 27]
data = data.drop(data.columns[unneeded_positions], axis=1)
data.head()
## First thing to do is remove any rows that have a missing mean temperature value ##
# 9999.9 is the sentinel code for a missing TEMP reading -- presumably the
# GSOD convention; TODO confirm against the data source docs.
print(data.shape)
# NOTE(review): the original collected *positional* indices in a loop, dropped
# those rows, then kept iterating later columns with *label* lookups
# (data[col][d] / data.at[d, col]).  Once rows are dropped, labels no longer
# match positions, so those lookups could hit the wrong rows, raise KeyError,
# or even enlarge the frame via .at.  Boolean masks avoid the mismatch.
data = data[data["TEMP"] != 9999.9]
print(data.shape)
# change all missing/not reported precipitation to zero '99.99' -> '0.00'
data.loc[data["PRCP"] == 99.99, "PRCP"] = 0.00
data['PRCP'] = pd.to_numeric(data['PRCP'])
data.head()
# change all missing/not reported snow depth to zero '999.99' -> '0.00'
data.loc[data["SNDP"] == 999.9, "SNDP"] = 0.00
data['SNDP'] = pd.to_numeric(data['SNDP'])
data.head()
## Clean up the remaining measurement columns ##
# Each column encodes "missing" with a sentinel value (9999.9 or 999.9
# depending on the column).  Replace every sentinel with the mean of the
# valid readings for that column.
# NOTE(review): the original nine copy-pasted sections stored str(mean) into
# numeric columns (silently turning them into object dtype) and re-used
# positional loop indices as row labels after rows had been dropped; both
# issues are fixed here, and a division-by-zero when a column is entirely
# sentinel values is avoided.
def _fill_missing_with_mean(df, column, sentinel):
    # Replace sentinel-coded entries of df[column] with the mean of the
    # non-sentinel entries.  No-op when there is nothing to replace or no
    # valid value to average.
    missing = df[column] == sentinel
    if missing.any() and (~missing).any():
        df.loc[missing, column] = df.loc[~missing, column].mean()

# (column, missing-value code) pairs, in the original cleanup order.
for _col, _code in [("SLP", 9999.9), ("WDSP", 999.9), ("GUST", 999.9),
                    ("MXSPD", 999.9), ("DEWP", 9999.9), ("STP", 999.9),
                    ("VISIB", 999.9), ("MIN", 9999.9), ("MAX", 9999.9)]:
    _fill_missing_with_mean(data, _col, _code)
data.head()
# Eyeball the spread of each remaining column with boxplots (outlier scan).
for _column in ('TEMP', 'ELEVATION', 'DEWP', 'SLP', 'STP', 'VISIB',
                'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP'):
    sns.boxplot(x=data[_column])
# Notebook-style peek at the average precipitation before the last plot.
data['PRCP'].mean()
sns.boxplot(x=data['SNDP'])
# Now that I understand there are outliers I need to check where exactly these outliers
# are, using the IQR method from this article I followed:
# https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
# Describe the frame and transpose (many columns, few summary rows).
outs = data.describe().T
# Interquartile range per feature, used for the fence calculation below.
spread = outs['75%'] - outs['25%']
# A feature has extreme outliers when its min falls 3 IQRs below the first
# quartile or its max rises 3 IQRs above the third quartile.
lower_fence = outs['25%'] - 3 * spread
upper_fence = outs['75%'] + 3 * spread
outs['outliers'] = (outs['min'] < lower_fence) | (outs['max'] > upper_fence)
# just display the features containing extreme outliers
outs.loc[outs.outliers]
# I am acknowledging the presence of outliers here, however, they fall in the cols
# that have low correlation to what I am trying to predict (TEMP). So, I have chosen
# to let them be right now because they will be eliminated later on down the page
## Add in the previous 3 day columns to have a 1,2,3 for each variable. Cannot predict weather for a certain day ##
## based on its own temperature readings we need the past two days. ##
# Build the lagged feature columns ("TEMP 1" = previous row's TEMP, "TEMP 2" =
# two rows back, ...) with nested loops instead of 36 near-identical
# assignments.  Column creation order matches the original: all 1-lags,
# then all 2-lags, then all 3-lags.  (The dead `N = data.shape[0]` was
# removed -- it was never read.)
# NOTE(review): shift() assumes consecutive rows are consecutive days; the
# earlier row drops may have left calendar gaps -- confirm against the data.
lag_source_cols = ['TEMP', 'DEWP', 'SLP', 'STP', 'VISIB', 'WDSP',
                   'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP']
for lag in (1, 2, 3):
    for col in lag_source_cols:
        data['{} {}'.format(col, lag)] = data[col].shift(periods=lag)
data.columns
# visualize the relationship between the features and the response (TEMP)
# using scatterplots, one pairplot per group of features
feature_groups = [
    ['DEWP 3', 'SLP 3', 'STP 3', 'VISIB 3',
     'WDSP 3', 'MXSPD 3', 'GUST 3', 'MAX 3', 'MIN 3', 'PRCP 3', 'SNDP 3'],
    ['WDSP 2', 'MXSPD 2', 'GUST 2', 'MAX 2', 'MIN 2',
     'PRCP 2', 'SNDP 2', 'TEMP 3'],
    ['MAX 1', 'MIN 1', 'PRCP 1', 'SNDP 1', 'TEMP 2', 'DEWP 2', 'SLP 2',
     'STP 2', 'VISIB 2'],
    ['TEMP', 'MAX', 'MIN', 'TEMP 1',
     'DEWP 1', 'SLP 1', 'STP 1', 'VISIB 1', 'WDSP 1', 'MXSPD 1', 'GUST 1'],
    ["TEMP 1", "TEMP 2", "TEMP 3", "MIN 1", "MIN 2", "MIN 3", "DEWP 1",
     "DEWP 2", "DEWP 3", "MAX 1", "MAX 2", "MAX 3"],
]
for group in feature_groups:
    sns.pairplot(data, x_vars=group, y_vars='TEMP', height=7, aspect=0.7)
# Standardize data #
# drop date and name (positions 0 and 2) -- not needed anymore really
temp_data = data.drop(data.columns[[0, 2]], axis=1)
# z-score every column: (x - mean) / std.  Operating on the whole frame is
# equivalent to the original full-extent iloc slicing, just without the noise.
Z = (temp_data - temp_data.mean()) / temp_data.std()
Z.head()
## Correlation is determined to see what values are most related to the Mean Temperature for the day ##
Z.corr()[['TEMP']].sort_values('TEMP')
# using the charts above and the correlation, drop every column that is not a
# candidate predictor (keep TEMP, the current-day MAX/MIN, and all lag columns)
keep_cols = {'TEMP', 'MAX', 'MIN', 'TEMP 1',
             'DEWP 1', 'SLP 1', 'STP 1', 'VISIB 1', 'WDSP 1', 'MXSPD 1', 'GUST 1',
             'MAX 1', 'MIN 1', 'PRCP 1', 'SNDP 1', 'TEMP 2', 'DEWP 2', 'SLP 2',
             'STP 2', 'VISIB 2', 'WDSP 2', 'MXSPD 2', 'GUST 2', 'MAX 2', 'MIN 2',
             'PRCP 2', 'SNDP 2', 'TEMP 3', 'DEWP 3', 'SLP 3', 'STP 3', 'VISIB 3',
             'WDSP 3', 'MXSPD 3', 'GUST 3', 'MAX 3', 'MIN 3', 'PRCP 3', 'SNDP 3'}
data.drop([c for c in data.columns if c not in keep_cols], axis=1, inplace=True)
data.head()
# my predictors for regression
# TEMP 1, TEMP 2, TEMP 3, MIN 1, MIN 2, MIN 3, DEWP 1, DEWP 2, DEWP 3, MAX 1, MAX 2, MAX 3
# trim my dataset down to these now and the temp that I want
# https://scikit-learn.org/stable/modules/classes.html#regression-metrics
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             median_absolute_error)

print(data.shape)
# shift() left NaNs in the first three rows of every lag column; drop them
data = data.dropna()
print(data.shape)
# regression target: today's mean temperature
value_wanted = data["TEMP"]
cols_wanted = ["TEMP 1", "TEMP 2", "TEMP 3", "MIN 1", "MIN 2", "MIN 3", "DEWP 1", "DEWP 2", "DEWP 3", "MAX 1",
               "MAX 2", "MAX 3"]
preds = data[cols_wanted]
# Data pre-processing finally done and ready to apply this data to our linear regression model to predict TEMP #
X_train, X_test, y_train, y_test = train_test_split(preds, value_wanted, test_size=0.2, random_state=12)
# instantiate the regressor class
regressor = LinearRegression()
# build the model by fitting the regressor to the training data
regressor.fit(X_train, y_train)
# make a prediction set using the test set
prediction = regressor.predict(X_test)
# Evaluate the prediction accuracy of the model.
# NOTE(review): regressor.score() returns R^2, not explained variance; use
# sklearn's explained_variance_score so the printed value matches its label.
print("The Explained Variance: %.2f" % explained_variance_score(y_test, prediction))
print("The Mean Absolute Error: %.2f degrees celsius" % mean_absolute_error(y_test, prediction))
print("The Median Absolute Error: %.2f degrees celsius" % median_absolute_error(y_test, prediction))
print(prediction)